import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
# Visualisation libraries
## Progress Bar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we use an IMDB movie dataset from Kaggle.com. We will develop a model for recommending movies similar to a given movie.
# Load the IMDB movie metadata from Kaggle.
Data = pd.read_csv('movie_metadata/movie_metadata.csv')
# Normalise column names: Title_Case with the IMDB acronym kept upper-case.
Data.columns = [c.title().replace('Imdb', 'IMDB') for c in Data.columns]
# Reorder columns: Movie_Title first, the remaining columns alphabetically.
Temp = ['Movie_Title'] + sorted(c for c in Data.columns if c != 'Movie_Title')
Data = Data.reindex(columns=Temp)
del Temp
Data.head().style.hide_index()
| Movie_Title | Actor_1_Facebook_Likes | Actor_1_Name | Actor_2_Facebook_Likes | Actor_2_Name | Actor_3_Facebook_Likes | Actor_3_Name | Aspect_Ratio | Budget | Cast_Total_Facebook_Likes | Color | Content_Rating | Country | Director_Facebook_Likes | Director_Name | Duration | Facenumber_In_Poster | Genres | Gross | IMDB_Score | Language | Movie_Facebook_Likes | Movie_IMDB_Link | Num_Critic_For_Reviews | Num_User_For_Reviews | Num_Voted_Users | Plot_Keywords | Title_Year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Avatar | 1000.000000 | CCH Pounder | 936.000000 | Joel David Moore | 855.000000 | Wes Studi | 1.780000 | 237000000.000000 | 4834 | Color | PG-13 | USA | 0.000000 | James Cameron | 178.000000 | 0.000000 | Action|Adventure|Fantasy|Sci-Fi | 760505847.000000 | 7.900000 | English | 33000 | http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1 | 723.000000 | 3054.000000 | 886204 | avatar|future|marine|native|paraplegic | 2009.000000 |
| Pirates of the Caribbean: At World's End | 40000.000000 | Johnny Depp | 5000.000000 | Orlando Bloom | 1000.000000 | Jack Davenport | 2.350000 | 300000000.000000 | 48350 | Color | PG-13 | USA | 563.000000 | Gore Verbinski | 169.000000 | 0.000000 | Action|Adventure|Fantasy | 309404152.000000 | 7.100000 | English | 0 | http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1 | 302.000000 | 1238.000000 | 471220 | goddess|marriage ceremony|marriage proposal|pirate|singapore | 2007.000000 |
| Spectre | 11000.000000 | Christoph Waltz | 393.000000 | Rory Kinnear | 161.000000 | Stephanie Sigman | 2.350000 | 245000000.000000 | 11700 | Color | PG-13 | UK | 0.000000 | Sam Mendes | 148.000000 | 1.000000 | Action|Adventure|Thriller | 200074175.000000 | 6.800000 | English | 85000 | http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1 | 602.000000 | 994.000000 | 275868 | bomb|espionage|sequel|spy|terrorist | 2015.000000 |
| The Dark Knight Rises | 27000.000000 | Tom Hardy | 23000.000000 | Christian Bale | 23000.000000 | Joseph Gordon-Levitt | 2.350000 | 250000000.000000 | 106759 | Color | PG-13 | USA | 22000.000000 | Christopher Nolan | 164.000000 | 0.000000 | Action|Thriller | 448130642.000000 | 8.500000 | English | 164000 | http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1 | 813.000000 | 2701.000000 | 1144337 | deception|imprisonment|lawlessness|police officer|terrorist plot | 2012.000000 |
| Star Wars: Episode VII - The Force Awakens | 131.000000 | Doug Walker | 12.000000 | Rob Walker | nan | nan | nan | nan | 143 | nan | nan | nan | 131.000000 | Doug Walker | nan | 0.000000 | Documentary | nan | 7.100000 | nan | 0 | http://www.imdb.com/title/tt5289954/?ref_=fn_tt_tt_1 | nan | nan | 8 | nan | nan |
Note that
def List_Print(Text, List, C = 'Blue', T = 'White'):
    """Print *Text* as a coloured label, followed by the comma-separated *List*.

    C names the background colour, T the foreground (text) colour.
    """
    BACK = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
            'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
            'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
            'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    label = BACK[C] + FORE[T] + Style.NORMAL + '%s:' % Text + Style.RESET_ALL
    print(label + ' ' + ', '.join(List))
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print *Text* as a coloured banner, padded with '=' to total width *L*."""
    BACK = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
            'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
            'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
            'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    banner = BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = FORE[C] + Style.NORMAL + '=' * (L - len(Text) - 1) + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of *L* '=' characters in colour *C*."""
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
            'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
            'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(FORE[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
List_Print('Columns', Data.columns)
Columns: Movie_Title, Actor_1_Facebook_Likes, Actor_1_Name, Actor_2_Facebook_Likes, Actor_2_Name, Actor_3_Facebook_Likes, Actor_3_Name, Aspect_Ratio, Budget, Cast_Total_Facebook_Likes, Color, Content_Rating, Country, Director_Facebook_Likes, Director_Name, Duration, Facenumber_In_Poster, Genres, Gross, IMDB_Score, Language, Movie_Facebook_Likes, Movie_IMDB_Link, Num_Critic_For_Reviews, Num_User_For_Reviews, Num_Voted_Users, Plot_Keywords, Title_Year
The following three columns are unnecessary for our study. Thus, we are going to remove these columns.
`Facenumber_In_Poster`, `Movie_IMDB_Link`, and `Aspect_Ratio`:
Drop_column_list = ['Facenumber_In_Poster', 'Movie_IMDB_Link', 'Aspect_Ratio']
display(Data[Drop_column_list].head())
| Facenumber_In_Poster | Movie_IMDB_Link | Aspect_Ratio | |
|---|---|---|---|
| 0 | 0.0 | http://www.imdb.com/title/tt0499549/?ref_=fn_t... | 1.78 |
| 1 | 0.0 | http://www.imdb.com/title/tt0449088/?ref_=fn_t... | 2.35 |
| 2 | 1.0 | http://www.imdb.com/title/tt2379713/?ref_=fn_t... | 2.35 |
| 3 | 0.0 | http://www.imdb.com/title/tt1345836/?ref_=fn_t... | 2.35 |
| 4 | 0.0 | http://www.imdb.com/title/tt5289954/?ref_=fn_t... | NaN |
Thus, these columns are dropped from our data
# Remove the three unneeded columns, then clean up the helper list.
Data.drop(Drop_column_list, axis=1, inplace=True)
del Drop_column_list
First off, note that
# Rows whose Movie_Title appears more than once (all but the last occurrence).
Data_Duplicated = Data[Data.Movie_Title.duplicated(keep='last')]
pd.DataFrame(Data_Duplicated.Movie_Title)
| Movie_Title | |
|---|---|
| 6 | Spider-Man 3 |
| 17 | The Avengers |
| 25 | King Kong |
| 30 | Skyfall |
| 33 | Alice in Wonderland |
| ... | ... |
| 3800 | Saving Grace |
| 3887 | Night of the Living Dead |
| 3984 | The Full Monty |
| 4223 | The Calling |
| 4949 | A Dog's Breakfast |
126 rows × 1 columns
These values are duplicated!
# Quantify the duplication before removing it.
print("There are %i unique rows from %i."% (len(Data.Movie_Title.unique()), len(Data.Movie_Title)))
number_of_duplicated=len(Data.Movie_Title)-len(Data.Movie_Title.unique())
print("There are %i duplicated rows."% number_of_duplicated)
del number_of_duplicated, Data_Duplicated
There are 4917 unique rows from 5043. There are 126 duplicated rows.
Removing the duplicated movies.
Data.drop_duplicates(subset="Movie_Title", inplace=True)
def Data_Plot(Inp, Title = None, W = None):
    """Bar-plot, per column of *Inp*, the percentage of non-NaN values.

    Bars are coloured by dtype.  Returns a DataFrame with one row per
    column: its dtype, NaN count, total size, and percentage of present
    values.

    Parameters
    ----------
    Inp : DataFrame to summarise.
    Title : optional figure title.
    W : optional figure width in pixels.
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Percentage of rows that are present (non-NaN) in each column.
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]), 2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
                 text = 'Percentage',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    # Idiom fix: compare against None with `is not None` (PEP 8), not `not W == None`.
    if W is not None:
        fig.update_layout(width = W)
    fig.update_traces(texttemplate= 10*' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if Title is not None:
        fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x':0.5,
                                 'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
# Visualise data completeness and list every column that still has NaNs.
data_info = Data_Plot(Data, Title = 'Movie Recommendation System', W = 980)
List_Print('Columns with NaN values', data_info.loc[data_info['Number of NaN Values']>0, 'Features'].values, C = 'Red')
Columns with NaN values: Actor_1_Facebook_Likes, Actor_1_Name, Actor_2_Facebook_Likes, Actor_2_Name, Actor_3_Facebook_Likes, Actor_3_Name, Budget, Color, Content_Rating, Country, Director_Facebook_Likes, Director_Name, Duration, Gross, Language, Num_Critic_For_Reviews, Num_User_For_Reviews, Plot_Keywords, Title_Year
We can just simply drop rows without director_name.
Data.drop(Data[Data.Director_Name.isna()].index, inplace=True)
Next, we handle the missing Budget values.
We can use sklearn.impute.SimpleImputer, an imputation transformer for completing missing values.
# Mean-impute the missing Budget values with scikit-learn's SimpleImputer.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
Data['Budget'] = imp.fit_transform(Data['Budget'].values.reshape(-1, 1))
del imp
For the movies' Color column, we have
# Mask of rows with a missing Color value; inspect the earliest such movies.
Ind = Data.Color.isna()
Data.loc[Ind, ['Movie_Title','Title_Year','Color']].sort_values(by='Title_Year', ascending=True).head()
| Movie_Title | Title_Year | Color | |
|---|---|---|---|
| 4846 | Midnight Cabaret | 1990.0 | NaN |
| 2753 | Shinjuku Incident | 2009.0 | NaN |
| 1948 | Dear John | 2010.0 | NaN |
| 3638 | Snow Flower and the Secret Fan | 2011.0 | NaN |
| 5020 | The Ridges | 2011.0 | NaN |
We can assume that these movies are all in color since the earliest movie on this list is 1990. Thus,
# All these movies are from 1990 or later, so assume they are in colour.
Data.Color.fillna('Color', inplace=True)
display(Data.loc[Ind, ['Movie_Title','Title_Year','Color']].sort_values(by='Title_Year', ascending=True).head())
| Movie_Title | Title_Year | Color | |
|---|---|---|---|
| 4846 | Midnight Cabaret | 1990.0 | Color |
| 2753 | Shinjuku Incident | 2009.0 | Color |
| 1948 | Dear John | 2010.0 | Color |
| 3638 | Snow Flower and the Secret Fan | 2011.0 | Color |
| 5020 | The Ridges | 2011.0 | Color |
As for the Language, we have,
# Unique languages (NaN included) rendered as printable strings.
Temp = [str(x) for x in Data.Language.unique()]
List_Print('Languages:', Temp, C = 'Yellow', T = 'Black')
Languages:: English, nan, Japanese, French, Mandarin, Aboriginal, Spanish, Filipino, Hindi, Russian, Maya, Kazakh, Telugu, Cantonese, German, Aramaic, Italian, Dutch, Dari, Hebrew, Chinese, Mongolian, Swedish, Korean, Thai, Bosnian, None, Hungarian, Portuguese, Icelandic, Danish, Arabic, Norwegian, Czech, Kannada, Zulu, Panjabi, Polish, Tamil, Dzongkha, Vietnamese, Indonesian, Urdu, Romanian, Persian, Slovenian, Greek, Swahili
We can see undesired values such as NaN and None. First, let's deal with None. We have
Data.loc[Data.Language == 'None', 'Country']
2895 Canada 3931 USA Name: Country, dtype: object
We can assume that these movies have been produced in English.
Data.loc[Data.Language.isna(), 'Country']
4 NaN 279 NaN 3086 USA 3539 USA 3869 USA 4110 USA 4409 USA 4630 USA 4810 USA 4885 USA 4958 USA Name: Country, dtype: object
Since most movies are from the USA, we can assume that these movies have been produced in English. Therefore,
# Default both NaN and 'None' languages to English.
Data.Language.fillna('English', inplace=True)
Data.Language.replace('None', 'English', inplace=True)
We can see that Duration of some movies are missing.
Data.loc[Data.Duration.isna(), ['Movie_Title','Duration']].head()
| Movie_Title | Duration | |
|---|---|---|
| 4 | Star Wars: Episode VII - The Force Awakens ... | NaN |
| 199 | Harry Potter and the Deathly Hallows: Part II | NaN |
| 206 | Harry Potter and the Deathly Hallows: Part I | NaN |
| 1510 | Black Water Transit | NaN |
| 3815 | Should've Been Romeo | NaN |
There is nothing that can be done regarding these movies and we are going to simply drop them.
# Duration cannot be recovered, so drop those rows; then inspect missing years.
Data.drop(Data.loc[Data.Duration.isna()].index, inplace=True)
display(Data.loc[Data.Title_Year.isna(), ['Movie_Title','Title_Year']])
| Movie_Title | Title_Year | |
|---|---|---|
| 279 | 10,000 B.C. | NaN |
| 2765 | Towering Inferno | NaN |
| 2870 | Del 1 - Män som hatar kvinnor | NaN |
Drop these data as well.
Data.drop(Data.loc[Data.Title_Year.isna()].index, inplace=True)
As for Country, there is only one movie missing the country name.
# The single movie whose Country is missing, with its cast and director.
temp=Data.loc[Data.Country.isna(),('Movie_Title','Actor_1_Name','Actor_2_Name','Actor_3_Name','Director_Name','Country')]
temp
| Movie_Title | Actor_1_Name | Actor_2_Name | Actor_3_Name | Director_Name | Country | |
|---|---|---|---|---|---|---|
| 4021 | Dawn Patrol | Chris Brochu | Jeff Fahey | Rita Wilson | Daniel Petrie Jr. | NaN |
We can search the data for the actor's names and their other movies.
# Other movies featuring any of the same three actors, to infer the country.
Data.loc[(Data.Actor_1_Name == temp.Actor_1_Name.values[0])|
         (Data.Actor_2_Name == temp.Actor_2_Name.values[0])|
         (Data.Actor_3_Name == temp.Actor_3_Name.values[0]),'Country']
del temp
This movie is made in the USA and we can replace NaN with the USA.
Data.Country.fillna('USA', inplace=True)
For movie ratings, note that the ratings used since 1996 are as follows (source: the MPAA film rating system).
| Rated | Description |
|---|---|
| G | General audiences – All ages admitted. |
| PG | Parental guidance suggested – Some material may not be suitable for children. |
| PG-13 | Parents strongly cautioned – Some material may be inappropriate for children under 13. |
| R | Restricted – Under 17 requires accompanying parent or adult guardian. |
| NC-17 | No one 17 and under admitted. |
Thus, a standard list of ratings can be defined as
# Canonical MPAA rating labels.
Ratings_Standard = np.array(['G', 'PG', 'PG-13', 'R', 'NC-17'], dtype=object)
List_Print('Ratings Standard', Ratings_Standard, C = 'Green')
Ratings Standard: G, PG, PG-13, R, NC-17
However,
List_Print('Content Rating', Data.Content_Rating.unique().astype(str), C = 'Green')
Content Rating: PG-13, PG, G, R, nan, TV-14, Not Rated, Unrated, Approved, NC-17, X, GP, Passed, M, TV-G, TV-PG
We need to convert
print(list(set(Data.Content_Rating.unique().tolist())-set(Ratings_Standard)))
[nan, 'X', 'GP', 'TV-14', 'Approved', 'Not Rated', 'M', 'TV-PG', 'Passed', 'Unrated', 'TV-G']
to the standard form. We can convert these values using the following table.
| Standard Format | Data Format |
|---|---|
| PG | G, TV-G, TV-PG, GP |
| R | M |
| Unrated | NaN,Not Rated |
| Approved | Passed |
| PG-13 | TV-14 |
| NC-17 | X |
# Map every legacy/TV rating label onto the standard MPAA set.
Conversation_Map = {'PG':'PG', 'TV-G':'PG', 'GP':'PG', 'TV-PG':'PG',
                    'PG-13':'PG-13', 'TV-14':'PG-13',
                    'G':'PG', 'R':'R', 'M':'R',
                    np.nan:'Unrated', 'Unrated':'Unrated', 'Not Rated':'Unrated',
                    'Approved':'Approved', 'Passed':'Approved',
                    'NC-17':'NC-17', 'X':'NC-17'}
# BUG FIX: the original wrote `Data.content_rating = ...`.  Attribute
# assignment with a name that does not exactly match a column only creates a
# new attribute on the DataFrame object — the 'Content_Rating' column was
# never updated (the unchanged unique() output confirmed this).  Assign with
# [] indexing so the column is really replaced.
Data['Content_Rating'] = Data.Content_Rating.map(Conversation_Map)
display(Data.Content_Rating.unique())
array(['PG-13', 'PG', 'G', 'R', nan, 'TV-14', 'Not Rated', 'Unrated',
'Approved', 'NC-17', 'X', 'GP', 'Passed', 'M', 'TV-G', 'TV-PG'],
dtype=object)
As for actors,
Columns = list(Data.columns)
# Actor-name columns, excluding the Facebook-like counters.
# BUG FIX: the original condition was
#     Columns[i].find('Actor') != -1 & Columns[i].find('Likes') == -1
# which Python parses as a chained comparison against `-1 & find(...)`
# (bitwise & binds tighter than the comparisons), not as a logical AND.
# It selected the right columns for this dataset only by accident; use
# substring membership with an explicit `and`.
Actors_Column_List = [c for c in Columns if 'Actor' in c and 'Likes' not in c]
display(Data.loc[Data.Actor_1_Name.isna(), Actors_Column_List])
| Actor_1_Name | Actor_2_Name | Actor_3_Name | |
|---|---|---|---|
| 4502 | NaN | NaN | NaN |
| 4519 | NaN | NaN | NaN |
| 4720 | NaN | NaN | NaN |
| 4837 | NaN | NaN | NaN |
| 4945 | NaN | NaN | NaN |
| 4946 | NaN | NaN | NaN |
| 4990 | NaN | NaN | NaN |
Replacing these values with 'None'.
# Fill the missing actor names with the placeholder string 'None'.
Data.Actor_1_Name.fillna('None', inplace=True)
Data.Actor_2_Name.fillna('None', inplace=True)
Data.Actor_3_Name.fillna('None', inplace=True)
List_Print("""Columns contain 'Likes'""", Search_List('Likes', Data.columns), C = 'Green')
Columns contain 'Likes': Actor_1_Facebook_Likes, Actor_2_Facebook_Likes, Actor_3_Facebook_Likes, Cast_Total_Facebook_Likes, Director_Facebook_Likes, Movie_Facebook_Likes
Columns = list(Data.columns)
# Collect the Facebook-like columns and the review/score/vote columns.
# Idiom fix: the original iterated by index and tested `str.find(...) != -1`;
# substring membership in a comprehension selects the same columns in the
# same order, far more readably.
Likes_List = [c for c in Columns if 'Likes' in c]
Reviews_List = [c for c in Columns
                if 'Review' in c or 'Score' in c or 'Vote' in c]
display(pd.DataFrame(Data[Likes_List].isna().sum(), columns=['Number of NaN']))
| Number of NaN | |
|---|---|
| Actor_1_Facebook_Likes | 7 |
| Actor_2_Facebook_Likes | 10 |
| Actor_3_Facebook_Likes | 17 |
| Cast_Total_Facebook_Likes | 0 |
| Director_Facebook_Likes | 0 |
| Movie_Facebook_Likes | 0 |
We can replace them with zero.
# BUG FIX: the original first called
#     Data.loc[:,Likes_List].fillna(0, inplace=True)
# which operates on a temporary copy and never modifies `Data` (the pandas
# chained-assignment pitfall); that no-op line is removed.  The assignments
# below perform the real replacement of missing like/review counts with zero.
Data.loc[:, Likes_List] = Data.loc[:, Likes_List].fillna(0)
Data.loc[:, Reviews_List] = Data.loc[:, Reviews_List].fillna(0)
del Likes_List, Reviews_List
Remaining NaN values
data_info = Data_Plot(Data, Title = 'Movie Recommendation System', W = 980)
First, let's look at the correlation plot for our data.
def Correlation_Plot(Inp, Fig_Size = 12, annot_kws = 11):
    """Draw a lower-triangle heatmap of the pairwise correlations of *Inp*.

    Parameters
    ----------
    Inp : DataFrame whose numeric columns are correlated.
    Fig_Size : figure width and height in inches.
    annot_kws : font size of the in-cell annotations.

    Returns the correlation matrix rounded to two decimals.
    """
    Correlation_Matrix = Inp.corr().round(2)
    # Boolean mask hiding the strict upper triangle (diagonal stays visible).
    # Replaces the original float zeros_like + triu_indices_from + a Python
    # loop resetting the diagonal with one idiomatic, correctly-typed call.
    mask = np.triu(np.ones_like(Correlation_Matrix, dtype=bool), k=1)
    fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap=sns.color_palette("Greens", n_colors=10), linewidths=0.2, vmin=0, vmax=1,
                cbar_kws={'label': 'Correlation', "aspect": 30, "shrink": .4},
                annot_kws={"size": annot_kws})
    return Correlation_Matrix
# Draw the correlation heatmap, then pick the columns whose correlation with
# Gross exceeds 0.5 (excluding Gross itself).
_ = Correlation_Plot (Data, 10)
Correlation_Matrix = Data.corr()
gross_most_corr_list=list(Correlation_Matrix.Gross[Correlation_Matrix.Gross > 0.5].index)
gross_most_corr_list.remove('Gross')
gross_most_corr_list
['Num_User_For_Reviews', 'Num_Voted_Users']
# Rows still missing Gross.
gross_null = Data[Data['Gross' ].isnull()]
display(gross_null[['Movie_Title', 'Gross']])
| Movie_Title | Gross | |
|---|---|---|
| 84 | The Lovers | NaN |
| 98 | Godzilla Resurgence | NaN |
| 242 | Asterix at the Olympic Games | NaN |
| 367 | Ben-Hur | NaN |
| 422 | All That Jazz | NaN |
| ... | ... | ... |
| 5031 | Sanctuary; Quite a Conundrum | NaN |
| 5032 | Bang | NaN |
| 5036 | The Mongol King | NaN |
| 5038 | Signed Sealed Delivered | NaN |
| 5040 | A Plague So Pleasant | NaN |
749 rows × 2 columns
and
# Rows with a known Gross, used below to train the regressor.
gross_no_null = Data[~Data['Gross' ].isnull()]
display(gross_no_null[['Movie_Title', 'Gross']])
| Movie_Title | Gross | |
|---|---|---|
| 0 | Avatar | 760505847.0 |
| 1 | Pirates of the Caribbean: At World's End | 309404152.0 |
| 2 | Spectre | 200074175.0 |
| 3 | The Dark Knight Rises | 448130642.0 |
| 5 | John Carter | 73058679.0 |
| ... | ... | ... |
| 5034 | Cavite | 70071.0 |
| 5035 | El Mariachi | 2040920.0 |
| 5037 | Newlyweds | 4584.0 |
| 5041 | Shanghai Calling | 10443.0 |
| 5042 | My Date with Drew | 85222.0 |
4050 rows × 2 columns
print('It can be seen that %i entries of the gross column has NaN value.' % len(gross_null))
It can be seen that 749 entries of the gross column has NaN value.
Let's do a further test.
# Features: the columns most correlated with Gross; target: Gross itself.
X = gross_no_null.loc[:, gross_most_corr_list]
y = gross_no_null.Gross
# Resampling Methods.  FIX: seed the split so the reported R^2 is
# reproducible — without random_state the score changed on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
# Linear Regression
reg = LinearRegression()
reg.fit(X_train, y_train)
# Prediction on the held-out set.
y_pred = reg.predict(X_test)
error = metrics.mean_absolute_error(y_test, y_pred)
print("Variance : " + str(round(metrics.r2_score(y_test, y_pred), 3)))
Variance : 0.411
We use the predicted values for Gross in place of the NaN values.
Data.loc[Data.Gross.isna(), 'Gross'] = reg.predict(Data.loc[Data.Gross.isna(), gross_most_corr_list])
# Final completeness check, then persist the cleaned dataset.
data_info = Data_Plot(Data, Title = 'Movie Recommendation System', W = 980)
# Save as CSV
Data.to_csv('movie_metadata/clean_movie_metadata.csv', index=False)